##################################################################################################
# Sentiment Analysis of SMD Data with Lexicoder
##################################################################################################
# Description:
##################################################################################################
# Script to compute the sentiment of all newspaper texts in the SMD data from 2019,
# including several descriptive plots of the sentiment with regard to the polls, language and
# different newspapers.
# We use the method proposed by Proksch et al. (2018), which is well established and works for many languages.
##################################################################################################
# Content:
##################################################################################################
# 1) Dependencies
# 2) Data Import
# 3) Pre-Processing of Text
# 4) Sentiment Analysis
# 5) Data Transformations for Descriptives
# 6) Descriptives
##################################################################################################
# 1) Dependencies: 
# Start from a clean workspace: removes every (non-hidden) object from the
# current environment before the script runs. Attached packages and options
# are not affected by this call.
rm(list=ls())
##################################################################################################
library(tm)
library(tokenizers)
library(dplyr)
library(tidyr)
library(stringi)
library(stringr)
library(readr)
library(quanteda)
library(pbmcapply)
##################################################################################################
# 2) Data Import
##################################################################################################
# Load Sentiment Dictionaries from Proksch et al. (2018)
# The relative "../input" paths below are resolved against this script's folder.
# The RStudio API is only reachable from inside an RStudio session; when the
# script runs elsewhere (e.g. via Rscript) the working directory is left as is
# and must already point to the script's folder.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}
getwd()
# Load Dictionary from Proksch et al. Multilingual Sentiment analysis (based on the Lexicoder Dictionary)
# load() restores the stored objects into the workspace AND returns their names,
# so one call per file is enough (the file is not read twice).
dictionaries <- load("../input/auto_dictionaries_lsd.RData")

# Load improved Lexicons
dictionaries_2 <- load("../input/lsde_frenche_germane.RData")

# Combine the object names of both files into one character vector
dictionaries <- c(dictionaries, dictionaries_2)
rm(dictionaries_2)
gc()
# Load Full Classified SMD Dataset
df_classify <- readRDS("../input/SMD_Text_Classified_Weekly.RDS")
##################################################################################################
# 3) Pre-Processing 
##################################################################################################
# Optional test sample: switch the condition on to work with a reproducible
# random subset of 10000 rows instead of the full data set.
if (FALSE) {
  set.seed(12345)
  df_classify <- df_classify[sample(nrow(df_classify), 10000), ]
}

# Clean the raw text column `tx` and store the result in `text`.
# The cleaning steps are applied strictly in the order listed; every step is a
# gsub() call, the first five with Perl-compatible regular expressions.
df_classify$text <- local({
  cleaning_steps <- list(
    # Insert a space after a full stop that directly joins two sentences:
    list(pattern = '(?<=[a-z])\\.(?=[A-Z\\(])', replacement = '. ', perl = TRUE),
    # Remove URLs (and the whitespace following them):
    list(pattern = 'http\\S+\\s*', replacement = '', perl = TRUE),
    # Remove leftover "http" fragments followed by alphanumeric characters
    # (note: this pattern does not remove numbers in general):
    list(pattern = 'http[[:alnum:]]*', replacement = '', perl = TRUE),
    # Remove leading and trailing whitespace:
    list(pattern = '^[[:space:]]*', replacement = '', perl = TRUE),
    list(pattern = '[[:space:]]*$', replacement = '', perl = TRUE),
    # Remove guillemets and curly braces that quanteda does not handle well:
    list(pattern = '|\\»|\\«|\\}|\\{', replacement = '', perl = FALSE),
    # Replace en dashes by a sentence break:
    list(pattern = '\\–', replacement = '. ', perl = FALSE),
    # Remove all kinds of brackets:
    list(pattern = '\\{|\\}|\\[|\\]|\\(|\\)', replacement = '', perl = FALSE)
  )

  cleaned <- df_classify$tx
  for (step in cleaning_steps) {
    cleaned <- gsub(step$pattern, step$replacement, cleaned, perl = step$perl)
  }
  as.character(cleaned)
})

# Check for languages: number of texts per language code
df_classify %>%
  group_by(la) %>%
  summarise(n = n())

##################################################################################################
# 4) Sentiment Analysis
##################################################################################################
#' Sentiment Calculator
#'
#' Scores every German, French and Italian text with the logged ratio of
#' positive to negative dictionary matches, log((pos + 0.5) / (neg + 0.5)).
#' Texts in any other language keep `NA` as their sentiment value.
#'
#' The language dictionaries are looked up by name (`extendeddict_de`,
#' `extendeddict_fr`, `extendeddict_it`), so these objects must already be
#' loaded in the calling environment chain (normally the global environment).
#'
#' @param data Data frame with a language column `la` and a text column `text`.
#' @param dictionarieslist Currently unused; kept so existing calls keep working.
#' @param nodes Number of threads handed to `quanteda_options(threads = )`.
#'   Note that this changes the quanteda option globally and is not reset.
#' @return `data` with an added `sentiment_value` column. Rows are returned
#'   grouped by language in the order de, fr, it, all other languages.
final_dat <- function(data, dictionarieslist = dictionaries, nodes = 2){
  quanteda_options(threads = nodes)
  cat("Make sure you have loaded the 'auto_dictionaries_lsd.RData' and 'lsde_frenche_germane.RData' in the environment.
      \nWithout them the function will not work\n")

  supported_languages <- c("de", "fr", "it")

  # Scores the rows of one language and returns them with `sentiment_value` set.
  score_language <- function(lang_data, lang) {
    # Nothing to score: skip tokenising and the dictionary lookup entirely.
    if (nrow(lang_data) == 0L) {
      return(lang_data)
    }

    lang_dictionary <- get(paste0("extendeddict_", lang))

    # `[[` returns a plain vector for data frames and tibbles alike.
    lang_tokens <- tokens(as.character(lang_data[["text"]]),
                          remove_punct = TRUE, remove_numbers = TRUE)
    # Count dictionary matches per text. tolower = FALSE keeps the dictionary
    # key names untouched; the lookup itself is case-insensitive by default.
    counts <- convert(dfm(tokens_lookup(lang_tokens, dictionary = lang_dictionary),
                          tolower = FALSE),
                      to = "data.frame")

    if (is.null(counts$pos) || is.null(counts$neg)) {
      stop("Dictionary 'extendeddict_", lang,
           "' must provide the keys 'pos' and 'neg'.", call. = FALSE)
    }

    # The 0.5 offset keeps the ratio finite when a text has no matches.
    lang_data$sentiment_value <- log((counts$pos + 0.5) / (counts$neg + 0.5))
    lang_data
  }

  # Add Sentiment Column:
  data$sentiment_value <- NA

  # Texts in languages we do not score are passed through unchanged:
  data_rest <- data %>% filter(!la %in% supported_languages)

  # Score each supported language separately:
  scored <- lapply(supported_languages, function(lang) {
    score_language(data %>% filter(la == lang), lang)
  })

  #Clear Mem:
  rm(data)
  gc()

  # Combine all frames again:
  do.call(rbind, c(scored, list(data_rest)))
}

# Score all texts (quanteda runs with 6 threads)
df <- final_dat(df_classify, nodes = 6)
##################################################################################################
# 5) Data Transformation
##################################################################################################
# Make sure the score is numeric and treat texts without a score (NA) as 0
df$sentiment_value <- as.numeric(df$sentiment_value)
df$sentiment_value[is.na(df$sentiment_value)] <- 0

# The cleaned text is no longer needed in the saved data set
df[["text"]] <- NULL

# NOTE(review): hard-coded, machine-specific output folder; confirm it matches
# the "../input" folder used for reading before running on another machine.
setwd("/home/mkubli/NER/input")
getwd()

saveRDS(df, file = "SMD_Text_Classified_Sentiment_Weekly.RDS")
##################################################################################################
# 6) Descriptives
##################################################################################################

# Distribution of the sentiment scores
summary(df$sentiment_value)
